# Standard data-analysis stack: pandas/numpy for data, matplotlib/seaborn/plotly for plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Fix: "%matplotlib inline" is an IPython magic, not Python — it is a
# SyntaxError when this file runs as a plain script. Modern Jupyter renders
# matplotlib figures inline by default, so it is safe to keep it commented.
# %matplotlib inline

# Silence noisy library warnings for a cleaner notebook (avoid in production code).
from warnings import filterwarnings
filterwarnings("ignore")

# Global plot styling.
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
# Load the advertising dataset (1000 rows x 10 columns).
# NOTE(review): hard-coded absolute Windows path — works only on this machine;
# consider a relative path or a config variable.
data=pd.read_csv("C:\\Users\\laxma\\Downloads\\advertising (1).csv")
# Bare expression: notebook-style display of the full DataFrame.
data
| Daily Time Spent on Site | Age | Area Income | Daily Internet Usage | Ad Topic Line | City | Male | Country | Timestamp | Clicked on Ad | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68.95 | 35 | 61833.90 | 256.09 | Cloned 5thgeneration orchestration | Wrightburgh | 0 | Tunisia | 2016-03-27 00:53:11 | 0 |
| 1 | 80.23 | 31 | 68441.85 | 193.77 | Monitored national standardization | West Jodi | 1 | Nauru | 2016-04-04 01:39:02 | 0 |
| 2 | 69.47 | 26 | 59785.94 | 236.50 | Organic bottom-line service-desk | Davidton | 0 | San Marino | 2016-03-13 20:35:42 | 0 |
| 3 | 74.15 | 29 | 54806.18 | 245.89 | Triple-buffered reciprocal time-frame | West Terrifurt | 1 | Italy | 2016-01-10 02:31:19 | 0 |
| 4 | 68.37 | 35 | 73889.99 | 225.58 | Robust logistical utilization | South Manuel | 0 | Iceland | 2016-06-03 03:36:18 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | 72.97 | 30 | 71384.57 | 208.58 | Fundamental modular algorithm | Duffystad | 1 | Lebanon | 2016-02-11 21:49:00 | 1 |
| 996 | 51.30 | 45 | 67782.17 | 134.42 | Grass-roots cohesive monitoring | New Darlene | 1 | Bosnia and Herzegovina | 2016-04-22 02:07:01 | 1 |
| 997 | 51.63 | 51 | 42415.72 | 120.37 | Expanded intangible solution | South Jessica | 1 | Mongolia | 2016-02-01 17:24:57 | 1 |
| 998 | 55.55 | 19 | 41920.79 | 187.95 | Proactive bandwidth-monitored policy | West Steven | 0 | Guatemala | 2016-03-24 02:35:54 | 0 |
| 999 | 45.01 | 26 | 29875.80 | 178.35 | Virtual 5thgeneration emulation | Ronniemouth | 0 | Brazil | 2016-06-03 21:43:21 | 1 |
1000 rows × 10 columns
# First five rows — quick sanity check of the loaded data (n=5 is the default).
data.head(n=5)
| Daily Time Spent on Site | Age | Area Income | Daily Internet Usage | Ad Topic Line | City | Male | Country | Timestamp | Clicked on Ad | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68.95 | 35 | 61833.90 | 256.09 | Cloned 5thgeneration orchestration | Wrightburgh | 0 | Tunisia | 2016-03-27 00:53:11 | 0 |
| 1 | 80.23 | 31 | 68441.85 | 193.77 | Monitored national standardization | West Jodi | 1 | Nauru | 2016-04-04 01:39:02 | 0 |
| 2 | 69.47 | 26 | 59785.94 | 236.50 | Organic bottom-line service-desk | Davidton | 0 | San Marino | 2016-03-13 20:35:42 | 0 |
| 3 | 74.15 | 29 | 54806.18 | 245.89 | Triple-buffered reciprocal time-frame | West Terrifurt | 1 | Italy | 2016-01-10 02:31:19 | 0 |
| 4 | 68.37 | 35 | 73889.99 | 225.58 | Robust logistical utilization | South Manuel | 0 | Iceland | 2016-06-03 03:36:18 | 0 |
# Last five rows — confirm the file loaded completely (n=5 is the default).
data.tail(n=5)
| Daily Time Spent on Site | Age | Area Income | Daily Internet Usage | Ad Topic Line | City | Male | Country | Timestamp | Clicked on Ad | |
|---|---|---|---|---|---|---|---|---|---|---|
| 995 | 72.97 | 30 | 71384.57 | 208.58 | Fundamental modular algorithm | Duffystad | 1 | Lebanon | 2016-02-11 21:49:00 | 1 |
| 996 | 51.30 | 45 | 67782.17 | 134.42 | Grass-roots cohesive monitoring | New Darlene | 1 | Bosnia and Herzegovina | 2016-04-22 02:07:01 | 1 |
| 997 | 51.63 | 51 | 42415.72 | 120.37 | Expanded intangible solution | South Jessica | 1 | Mongolia | 2016-02-01 17:24:57 | 1 |
| 998 | 55.55 | 19 | 41920.79 | 187.95 | Proactive bandwidth-monitored policy | West Steven | 0 | Guatemala | 2016-03-24 02:35:54 | 0 |
| 999 | 45.01 | 26 | 29875.80 | 178.35 | Virtual 5thgeneration emulation | Ronniemouth | 0 | Brazil | 2016-06-03 21:43:21 | 1 |
# Summary statistics for the numeric columns; the percentiles listed here
# are pandas' defaults, spelled out explicitly.
data.describe(percentiles=[0.25, 0.5, 0.75])
| Daily Time Spent on Site | Age | Area Income | Daily Internet Usage | Male | Clicked on Ad | |
|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 1000.00000 |
| mean | 65.000200 | 36.009000 | 55000.000080 | 180.000100 | 0.481000 | 0.50000 |
| std | 15.853615 | 8.785562 | 13414.634022 | 43.902339 | 0.499889 | 0.50025 |
| min | 32.600000 | 19.000000 | 13996.500000 | 104.780000 | 0.000000 | 0.00000 |
| 25% | 51.360000 | 29.000000 | 47031.802500 | 138.830000 | 0.000000 | 0.00000 |
| 50% | 68.215000 | 35.000000 | 57012.300000 | 183.130000 | 0.000000 | 0.50000 |
| 75% | 78.547500 | 42.000000 | 65470.635000 | 218.792500 | 1.000000 | 1.00000 |
| max | 91.430000 | 61.000000 | 79484.800000 | 269.960000 | 1.000000 | 1.00000 |
# Column dtypes and non-null counts; the output shows all 1000 rows are non-null.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Daily Time Spent on Site 1000 non-null float64 1 Age 1000 non-null int64 2 Area Income 1000 non-null float64 3 Daily Internet Usage 1000 non-null float64 4 Ad Topic Line 1000 non-null object 5 City 1000 non-null object 6 Male 1000 non-null int64 7 Country 1000 non-null object 8 Timestamp 1000 non-null object 9 Clicked on Ad 1000 non-null int64 dtypes: float64(3), int64(3), object(4) memory usage: 78.2+ KB
# Per-column missing-value counts (isna is the modern alias of isnull).
data.isna().sum()
Daily Time Spent on Site 0 Age 0 Area Income 0 Daily Internet Usage 0 Ad Topic Line 0 City 0 Male 0 Country 0 Timestamp 0 Clicked on Ad 0 dtype: int64
# Count fully duplicated rows (keep='first' is pandas' default flagging rule).
data.duplicated(keep='first').sum()
0
#VISUALIZATION
# Bar chart of Area Income against Age.
# NOTE(review): with repeated Age values, plt.bar overdraws bars at the same
# x position, so only one income per age remains visible — a scatter or a
# groupby-mean would be more faithful; confirm intent.
plt.bar(data['Age'],data['Area Income'])
plt.xticks(rotation=90)
plt.show()
# Interactive bar chart: the binary 'Male' flag summed per country,
# i.e. the count of male respondents by country.
fig=px.bar(data,x='Country',y='Male',color='Country')
fig.show()
# Distribution of daily internet usage, split by whether the ad was clicked.
fig = px.violin(
    data,
    x='Clicked on Ad',
    y='Daily Internet Usage',
    color='Clicked on Ad',
)
fig.show()
# Ad topic lines plotted against the ages of users who saw them.
# NOTE(review): with ~1000 near-unique topic lines on the y-axis this chart
# is unlikely to be readable — consider aggregating before plotting.
fig=px.bar(data,x='Age',y='Ad Topic Line',color='Ad Topic Line')
fig.show()
# Fix: the original drew two unrelated plots on ONE axes — a bar of daily
# time on site over the binary 'Clicked on Ad' (x in {0, 1}) and a scatter of
# Age vs Area Income (x in ~19..61). The mismatched x-scales made both
# unreadable; render each on its own figure instead.

# Daily time on site per click outcome. Bars at x=0 and x=1 are overdrawn,
# so only the tallest value per class shows — a boxplot would reveal more.
plt.figure()
plt.bar(data['Clicked on Ad'], data['Daily Time Spent on Site'])
plt.xlabel('Clicked on Ad')
plt.ylabel('Daily Time Spent on Site')
plt.show()

# Age vs Area Income.
plt.figure()
plt.scatter(data['Age'], data['Area Income'], color='red')
plt.xlabel('Age')
plt.ylabel('Area Income')
plt.xticks(rotation=90)
plt.show()
# Class balance of the target variable.
plt.figure(figsize=(10, 4))
ax = sns.countplot(x='Clicked on Ad', data=data, color='cyan')
ax.set_title('Clicked on Ad')
plt.show()
# Mean Age per click class, drawn as a line with a confidence band.
ax = sns.lineplot(x='Clicked on Ad', y='Age', data=data)
ax.set_title('Clicked on Ad by Age')
Text(0.5, 1.0, 'Clicked on Ad by Age')
# Mean Age by the binary gender flag.
# Fix: positional (x, y) arguments to seaborn plotting functions were
# deprecated in seaborn 0.12 and now raise a TypeError — pass them by keyword.
sns.barplot(x=data['Male'], y=data['Age'], color='r')
plt.xticks(rotation=90)
plt.show()
# Scatter of daily time on site against age.
plt.figure(figsize=(8, 4))
sns.scatterplot(x='Daily Time Spent on Site', y='Age', data=data)
plt.title('Daily Time Spent on Site in age')
plt.xlabel('Daily Time Spent on Site')
plt.ylabel('Age')
plt.show()
# Histogram (FacetGrid) of daily internet usage; passing the frame plus the
# column name is equivalent to passing the Series directly.
sns.displot(data, x="Daily Internet Usage")
<seaborn.axisgrid.FacetGrid at 0x147bb1143d0>
# Area Income against the binary click outcome (scatter via relplot).
sns.relplot(x='Clicked on Ad',y='Area Income',data=data)
<seaborn.axisgrid.FacetGrid at 0x147bcb99460>
# Age distribution, one histogram bin per distinct age value.
plt.figure(figsize=(8, 6))
ages = data['Age']
ages.hist(bins=ages.nunique())
plt.xlabel('Age')
Text(0.5, 0, 'Age')
# Joint distribution of Area Income and Age.
# Fix: sns.jointplot creates its own figure, so the original
# plt.figure(figsize=(8,6)) only produced an orphan empty figure
# ("<Figure ... with 0 Axes>" in the output). Size the joint plot via its
# own `height` parameter instead (joint plots are square).
sns.jointplot(x=data["Area Income"], y=data.Age, height=8)
<seaborn.axisgrid.JointGrid at 0x147bb2456d0>
<Figure size 800x600 with 0 Axes>
# KDE joint plot of daily time on site vs age.
# Fix: sns.jointplot creates its own figure, so the original
# plt.figure(figsize=(8,6)) only produced an orphan empty figure; use the
# `height` parameter of jointplot instead.
sns.jointplot(x=data["Daily Time Spent on Site"], y=data.Age, kind='kde', height=8)
<seaborn.axisgrid.JointGrid at 0x147bb245bb0>
<Figure size 800x600 with 0 Axes>
# Joint distribution of daily time on site vs daily internet usage.
# Fix: sns.jointplot creates its own figure, so the original
# plt.figure(figsize=(8,6)) only produced an orphan empty figure; use the
# `height` parameter of jointplot instead.
sns.jointplot(x=data["Daily Time Spent on Site"], y=data["Daily Internet Usage"], height=8)
<seaborn.axisgrid.JointGrid at 0x147bdb9d640>
<Figure size 800x600 with 0 Axes>
# Pairwise scatter matrix over all numeric columns (can be slow for wide data).
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x147bf277910>
#MODEL BUILDING
# Target class balance: the output shows 500 clicked vs 500 not clicked —
# perfectly balanced, so plain accuracy is a reasonable headline metric.
data['Clicked on Ad'].value_counts()
0 500 1 500 Name: Clicked on Ad, dtype: int64
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for a split.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    train : bool, default True
        Evaluate on the training split when True, otherwise on the test split.
    """
    # Fix: the original branched with the anti-idiom `elif train == False:`
    # and duplicated the whole report body in each branch with inconsistently
    # capitalised labels ("confusion matrix" vs "Confusion Matrix"). Select
    # the split once, then print one uniform report.
    if train:
        X, y, label = X_train, y_train, "Train result:"
    else:
        X, y, label = X_test, y_test, "Test Result:"
    pred = clf.predict(X)
    clf_report = pd.DataFrame(classification_report(y, pred, output_dict=True))
    print(f"{label}\n")
    print(f"Accuracy score: {accuracy_score(y, pred) * 100:.2f}%")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print(f"Confusion Matrix:\n{confusion_matrix(y, pred)}\n")
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
# Features: drop the label plus free-text / high-cardinality columns
# (timestamps, topic lines, city, country) that a linear model cannot use raw.
X = data.drop(['Timestamp', 'Clicked on Ad', 'Ad Topic Line', 'Country', 'City'], axis=1)
y = data['Clicked on Ad']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

num_columns = ['Daily Time Spent on Site','Age','Area Income','Daily Internet Usage','Male']
# Fix: the original transformer applied BOTH MinMaxScaler AND StandardScaler
# to the SAME five columns, so every feature came out twice (10 collinear
# output columns). One scaler is sufficient for logistic regression.
ct = make_column_transformer(
    (StandardScaler(), num_columns),
    remainder='passthrough'
)
# Fit the scaler on the training split only, then apply it to the test split,
# to avoid test-set leakage.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Baseline classifier: logistic regression with the liblinear solver,
# well suited to small dense binary-classification problems.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)
# Report metrics on the training split, then on the held-out test split.
for on_train in (True, False):
    print_score(lr_clf, X_train, y_train, X_test, y_test, train=on_train)
Train result:
Accuracy score: 97.43%
CLASSIFICATION REPORT:
0 1 accuracy macro avg weighted avg
precision 0.964088 0.985207 0.974286 0.974648 0.974527
recall 0.985876 0.962428 0.974286 0.974152 0.974286
f1-score 0.974860 0.973684 0.974286 0.974272 0.974279
support 354.000000 346.000000 0.974286 700.000000 700.000000
confusion matrix:
[[349 5]
[ 13 333]]
Test Result:
Accuracy score: 97.00%
CLASSIFICATION REPORT:
0 1 accuracy macro avg weighted avg
precision 0.959732 0.980132 0.97 0.969932 0.970204
recall 0.979452 0.961039 0.97 0.970246 0.970000
f1-score 0.969492 0.970492 0.97 0.969992 0.970005
support 146.000000 154.000000 0.97 300.000000 300.000000
Confusion Matrix:
[[143 3]
[ 6 148]]